package com.hanshg.cherry.task.crawler;

import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scheduled crawler job that launches a WebMagic {@code Spider} against 51job
 * search results and (optionally) routes requests through free proxies
 * scraped from www.89ip.cn.
 *
 * @author 柠檬水
 * @since 2020/4/23
 **/
@Component
@Slf4j
public class CrawlerJob {

    /**
     * Matches one "a.b.c.d:port" proxy entry in the 89ip.cn response.
     * Compiled once: re-compiling a Pattern on every call is wasted work.
     */
    private static final Pattern PROXY_PATTERN =
            Pattern.compile("(\\d+)\\.(\\d+)\\.(\\d+)\\.(\\d+):(\\d+)");

    /** Free proxy-list endpoint (www.89ip.cn) returning up to 60 "ip:port" entries. */
    private static final String PROXY_API_URL =
            "http://www.89ip.cn/tqdl.html?api=1&num=60&port=&address=&isp=";

    /** 51job search-result page (keyword "java", area 010000, page 1) used as the seed URL. */
    private String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private CrawlerPipeline crawlerPipeline;

    @Autowired
    private CrawlerPageProcessor crawlerPageProcessor;

    /**
     * Builds and starts the crawl: fetches a fresh proxy list, configures a
     * proxied downloader (currently disabled — see the commented
     * {@code setDownloader} line below), then starts a 5-thread spider
     * asynchronously on the seed URL.
     *
     * <p>Any failure (network error while fetching proxies, spider setup)
     * is caught and logged so a scheduler invoking this job keeps running.
     */
    public void startJob() {
        try {
            List<Proxy> proxies = buildProxyIP();
            // Prepared for the commented-out .setDownloader(...) call below;
            // kept so the proxy setup can be re-enabled by uncommenting one line.
            HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
            httpClientDownloader.setProxyProvider(new SimpleProxyProvider(proxies));

            Spider spider = Spider.create(crawlerPageProcessor)
                    .addUrl(url)
                    .addPipeline(crawlerPipeline)
                    // Bloom-filter de-duplication (disabled)
                    // .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100*10000)))
                    // Redis-backed scheduler (disabled)
                    // .setScheduler(new RedisScheduler(new JedisPool("127.0.0.1")))
                    // Route requests through the free proxies (disabled)
                    // .setDownloader(httpClientDownloader)
                    .thread(5);
            spider.start(); // asynchronous start
        } catch (Exception ex) {
            log.error("定时抓取51job招聘数据线程执行异常", ex);
        }

    }

    /**
     * Fetches a list of free HTTP proxies from www.89ip.cn.
     *
     * <p>Downloads the API page (5s timeout), extracts every "ip:port"
     * occurrence with {@link #PROXY_PATTERN}, and converts each to a
     * WebMagic {@link Proxy}. Returns an empty list when the page
     * contains no matching entries.
     *
     * @return proxies parsed from the response, possibly empty
     * @throws IOException if the proxy-list page cannot be fetched
     */
    private static List<Proxy> buildProxyIP() throws IOException {
        Document page = Jsoup.parse(new URL(PROXY_API_URL), 5000);
        Matcher matcher = PROXY_PATTERN.matcher(page.toString());
        List<Proxy> proxies = new ArrayList<>();
        while (matcher.find()) {
            String[] hostAndPort = matcher.group().split(":");
            int port = Integer.parseInt(hostAndPort[1]); // fixed typo: was "prot"
            proxies.add(new Proxy(hostAndPort[0], port));
        }
        return proxies;
    }
}
